{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1.单一页面的爬取，试试水"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>edu</th>\n",
       "      <th>经验</th>\n",
       "      <th>薪水</th>\n",
       "      <th>时间</th>\n",
       "      <th>职称</th>\n",
       "      <th>公司地点</th>\n",
       "      <th>公司名称</th>\n",
       "      <th>链结</th>\n",
       "      <th>公司URL</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>10-20k·12薪</td>\n",
       "      <td>2020年07月14日</td>\n",
       "      <td>网络运营管理</td>\n",
       "      <td>廊坊-固安县</td>\n",
       "      <td>北京海利华科教育科技股份有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929909489.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9322810/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>5-8k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网站运营主管</td>\n",
       "      <td>重庆</td>\n",
       "      <td>重庆市沙坪坝区新远伦教育培训学校</td>\n",
       "      <td>https://www.liepin.com/job/1929353477.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12172229/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>7-10k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>长沙-雨花区</td>\n",
       "      <td>湖南象盒网络科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1930040579.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9949751/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>硕士及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>15-20k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网站运营总监</td>\n",
       "      <td>北京</td>\n",
       "      <td>喀斯玛(北京)科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929920163.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10100863/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>15-25k·13薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网站运营经理</td>\n",
       "      <td>深圳-龙岗区</td>\n",
       "      <td>深圳乐木骆科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929491963.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12177731/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>8-15k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网络运营经理</td>\n",
       "      <td>重庆</td>\n",
       "      <td>彭水县鸿程商贸有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1927698605.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10015511/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>10-15k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网络运营总监</td>\n",
       "      <td>金华</td>\n",
       "      <td>金华婺城口腔医院</td>\n",
       "      <td>https://www.liepin.com/job/1925360765.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10100661/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>40-60k·15薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>运营负责人（多语言网站）</td>\n",
       "      <td></td>\n",
       "      <td>某知名B2B跨境电商企业</td>\n",
       "      <td>https://www.liepin.com/a/20584255.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>8-12k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网络运营主管</td>\n",
       "      <td>南京-秦淮区</td>\n",
       "      <td>南京某体育培训机构</td>\n",
       "      <td>https://www.liepin.com/a/20730347.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>20-27k·15薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>儿科类门诊网络运营总监</td>\n",
       "      <td>武汉</td>\n",
       "      <td>某上市药企</td>\n",
       "      <td>https://www.liepin.com/a/21134451.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>3-7k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>广州</td>\n",
       "      <td>广州大馨科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1930033697.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12314609/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>4-6k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>泰语网站运营</td>\n",
       "      <td>合肥-高新区</td>\n",
       "      <td>澳鹏网络</td>\n",
       "      <td>https://www.liepin.com/job/1930030491.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8430632/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>4-6k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>法语网站运营</td>\n",
       "      <td>南昌-高新区</td>\n",
       "      <td>澳鹏网络</td>\n",
       "      <td>https://www.liepin.com/job/1930030489.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8430632/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>15-30k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>资深网站运营/SEO专家</td>\n",
       "      <td>上海</td>\n",
       "      <td>珍岛信息技术（上海）股份有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928412511.shtml</td>\n",
       "      <td>https://www.liepin.com/company/1902693/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>4-6k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营/电商运营</td>\n",
       "      <td>郑州-金水区</td>\n",
       "      <td>河南佳汇供应链管理有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928081475.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10117257/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>7-12k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>深圳-科技园</td>\n",
       "      <td>盈富斯</td>\n",
       "      <td>https://www.liepin.com/job/1927933933.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8523739/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>15-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营经理</td>\n",
       "      <td>上海-浦东新区</td>\n",
       "      <td>河姆渡</td>\n",
       "      <td>https://www.liepin.com/job/1927774977.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8636161/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>12-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营主管</td>\n",
       "      <td>深圳-福田区</td>\n",
       "      <td>北京兰亭高创科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1922469633.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10073645/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>5-8k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营专员</td>\n",
       "      <td>五指山</td>\n",
       "      <td>五指山仁商基业有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1921122667.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9469403/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>12-16k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营主管（电脑周边品类）</td>\n",
       "      <td>深圳-福田区</td>\n",
       "      <td>北京兰亭高创科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1921004331.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10073645/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>14-20k·13薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>深圳-龙华区</td>\n",
       "      <td>某知名电子商务公司</td>\n",
       "      <td>https://www.liepin.com/a/20520083.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>6-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营推广/平台数据分析</td>\n",
       "      <td>宁波-首南</td>\n",
       "      <td>宁波渠成集团有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1911189738.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8096424/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>25-40k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网站运营总监</td>\n",
       "      <td>上海-航头</td>\n",
       "      <td>河姆渡</td>\n",
       "      <td>https://www.liepin.com/job/194614052.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8636161/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>一年以下</td>\n",
       "      <td>8-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营经理</td>\n",
       "      <td>武汉-武昌区</td>\n",
       "      <td>武汉科思沃斯教育科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929555931.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12280475/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>10-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营经理</td>\n",
       "      <td>深圳</td>\n",
       "      <td>深圳市金金金黄金珠宝集团有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928777847.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12179445/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>15-30k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营总监</td>\n",
       "      <td>广州-白云区</td>\n",
       "      <td>广州市艾依格家居制品有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1926237249.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7882143/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>5-6k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营专员</td>\n",
       "      <td>上海-嘉定区</td>\n",
       "      <td>上海青谷培训学校有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928024771.shtml</td>\n",
       "      <td>https://www.liepin.com/company/10158297/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>15-20k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营经理</td>\n",
       "      <td>武汉-汉阳区</td>\n",
       "      <td>武汉美福源孕婴童文化发展有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1922544783.shtml</td>\n",
       "      <td>https://www.liepin.com/company/8748602/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>25-40k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营总监</td>\n",
       "      <td></td>\n",
       "      <td>北京某科技股份有限公司</td>\n",
       "      <td>https://www.liepin.com/a/21244297.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>5-10k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营专员</td>\n",
       "      <td>上海-浦东新区</td>\n",
       "      <td>开源环保科技(上海)有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1921218897.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9967783/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>40-60k·13薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营专家</td>\n",
       "      <td>深圳-南山区</td>\n",
       "      <td>XX消费金融</td>\n",
       "      <td>https://www.liepin.com/a/20635273.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>45-60k·12薪</td>\n",
       "      <td>2020年07月19日</td>\n",
       "      <td>网络教育中心运营总监副总（APP运营）线上教育运营总监</td>\n",
       "      <td>北京</td>\n",
       "      <td>全球化规模的以IP为核心的儿童全产业链科技文创平台。</td>\n",
       "      <td>https://www.liepin.com/a/21249335.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>15-25k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络运营高级经理</td>\n",
       "      <td></td>\n",
       "      <td>中国某技术研究所（北京）</td>\n",
       "      <td>https://www.liepin.com/a/21303533.shtml</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>面议</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络技术运营工程师</td>\n",
       "      <td></td>\n",
       "      <td>字节跳动</td>\n",
       "      <td>https://www.liepin.com/job/1924504251.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7863078/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>学历不限</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>8-15k·12薪</td>\n",
       "      <td>2020年07月18日</td>\n",
       "      <td>网络直播运营自由不加班</td>\n",
       "      <td>合肥-庐阳区</td>\n",
       "      <td>安徽慕娱文化传媒有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1928009023.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12227503/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>统招本科</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>13-30k·12薪</td>\n",
       "      <td>2020年07月17日</td>\n",
       "      <td>数字化营销经理—SEO/网站运营方向</td>\n",
       "      <td>长沙-长沙县</td>\n",
       "      <td>三一集团</td>\n",
       "      <td>https://www.liepin.com/job/1930018325.shtml</td>\n",
       "      <td>https://www.liepin.com/company/892388/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>4-8k·12薪</td>\n",
       "      <td>2020年07月17日</td>\n",
       "      <td>网站运营专员</td>\n",
       "      <td>东莞-长安镇</td>\n",
       "      <td>广东伟一工业互联网科技有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929954735.shtml</td>\n",
       "      <td>https://www.liepin.com/company/12263359/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>4-6k·13薪</td>\n",
       "      <td>2020年07月17日</td>\n",
       "      <td>B2B网站运营</td>\n",
       "      <td>福州-长乐区</td>\n",
       "      <td>德诚珠宝集团有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929929539.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7896514/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>大专及以上</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>5-10k·13薪</td>\n",
       "      <td>2020年07月17日</td>\n",
       "      <td>B2B推广网站运营人员</td>\n",
       "      <td>福州-长乐区</td>\n",
       "      <td>德诚珠宝集团有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929928321.shtml</td>\n",
       "      <td>https://www.liepin.com/company/7896514/</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>本科及以上</td>\n",
       "      <td>经验不限</td>\n",
       "      <td>5-8k·12薪</td>\n",
       "      <td>2020年07月17日</td>\n",
       "      <td>法语运营/网站运营/法语网站运营 /电子商务</td>\n",
       "      <td>苏州-常熟</td>\n",
       "      <td>苏州赛奥科传媒有限公司</td>\n",
       "      <td>https://www.liepin.com/job/1929925597.shtml</td>\n",
       "      <td>https://www.liepin.com/company/9592537/</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      edu     经验          薪水           时间                           职称  \\\n",
       "0   本科及以上   3-5年  10-20k·12薪  2020年07月14日                       网络运营管理   \n",
       "1    学历不限   经验不限    5-8k·12薪  2020年07月19日                       网站运营主管   \n",
       "2   大专及以上   1-3年   7-10k·12薪  2020年07月19日                         网站运营   \n",
       "3   硕士及以上  5-10年  15-20k·12薪  2020年07月19日                       网站运营总监   \n",
       "4    统招本科   3-5年  15-25k·13薪  2020年07月19日                       网站运营经理   \n",
       "5   大专及以上  5-10年   8-15k·12薪  2020年07月19日                       网络运营经理   \n",
       "6   大专及以上  5-10年  10-15k·12薪  2020年07月19日                       网络运营总监   \n",
       "7    统招本科  5-10年  40-60k·15薪  2020年07月19日                 运营负责人（多语言网站）   \n",
       "8   大专及以上   3-5年   8-12k·12薪  2020年07月19日                       网络运营主管   \n",
       "9    统招本科  5-10年  20-27k·15薪  2020年07月19日                  儿科类门诊网络运营总监   \n",
       "10   学历不限   经验不限    3-7k·12薪  2020年07月18日                         网站运营   \n",
       "11  本科及以上   经验不限    4-6k·12薪  2020年07月18日                       泰语网站运营   \n",
       "12  本科及以上   经验不限    4-6k·12薪  2020年07月18日                       法语网站运营   \n",
       "13  本科及以上  5-10年  15-30k·12薪  2020年07月18日                 资深网站运营/SEO专家   \n",
       "14  大专及以上   1-3年    4-6k·12薪  2020年07月18日                    网站运营/电商运营   \n",
       "15  本科及以上   1-3年   7-12k·12薪  2020年07月18日                         网站运营   \n",
       "16  本科及以上   3-5年  15-20k·12薪  2020年07月18日                       网站运营经理   \n",
       "17   统招本科   3-5年  12-20k·12薪  2020年07月18日                       网站运营主管   \n",
       "18  大专及以上   3-5年    5-8k·12薪  2020年07月18日                       网站运营专员   \n",
       "19  本科及以上   3-5年  12-16k·12薪  2020年07月18日               网站运营主管（电脑周边品类）   \n",
       "20   统招本科   3-5年  14-20k·13薪  2020年07月18日                         网站运营   \n",
       "21  本科及以上   1-3年   6-10k·12薪  2020年07月18日                网站运营推广/平台数据分析   \n",
       "22   统招本科  5-10年  25-40k·12薪  2020年07月18日                       网站运营总监   \n",
       "23  本科及以上   一年以下   8-10k·12薪  2020年07月18日                       网络运营经理   \n",
       "24   学历不限   经验不限  10-20k·12薪  2020年07月18日                       网络运营经理   \n",
       "25  大专及以上  5-10年  15-30k·12薪  2020年07月18日                       网络运营总监   \n",
       "26  大专及以上   1-3年    5-6k·12薪  2020年07月18日                       网络运营专员   \n",
       "27  大专及以上  5-10年  15-20k·12薪  2020年07月18日                       网络运营经理   \n",
       "28   学历不限   3-5年  25-40k·12薪  2020年07月18日                       网络运营总监   \n",
       "29  大专及以上   1-3年   5-10k·12薪  2020年07月18日                       网络运营专员   \n",
       "30  本科及以上  5-10年  40-60k·13薪  2020年07月18日                       网络运营专家   \n",
       "31  本科及以上   经验不限  45-60k·12薪  2020年07月19日  网络教育中心运营总监副总（APP运营）线上教育运营总监   \n",
       "32   统招本科   3-5年  15-25k·12薪  2020年07月18日                     网络运营高级经理   \n",
       "33  本科及以上   3-5年          面议  2020年07月18日                    网络技术运营工程师   \n",
       "34   学历不限   经验不限   8-15k·12薪  2020年07月18日                  网络直播运营自由不加班   \n",
       "35   统招本科   3-5年  13-30k·12薪  2020年07月17日           数字化营销经理—SEO/网站运营方向   \n",
       "36  大专及以上   1-3年    4-8k·12薪  2020年07月17日                       网站运营专员   \n",
       "37  大专及以上   1-3年    4-6k·13薪  2020年07月17日                      B2B网站运营   \n",
       "38  大专及以上   1-3年   5-10k·13薪  2020年07月17日                  B2B推广网站运营人员   \n",
       "39  本科及以上   经验不限    5-8k·12薪  2020年07月17日       法语运营/网站运营/法语网站运营 /电子商务   \n",
       "\n",
       "       公司地点                        公司名称  \\\n",
       "0    廊坊-固安县            北京海利华科教育科技股份有限公司   \n",
       "1        重庆            重庆市沙坪坝区新远伦教育培训学校   \n",
       "2    长沙-雨花区                湖南象盒网络科技有限公司   \n",
       "3        北京               喀斯玛(北京)科技有限公司   \n",
       "4    深圳-龙岗区                 深圳乐木骆科技有限公司   \n",
       "5        重庆                 彭水县鸿程商贸有限公司   \n",
       "6        金华                    金华婺城口腔医院   \n",
       "7                          某知名B2B跨境电商企业   \n",
       "8    南京-秦淮区                   南京某体育培训机构   \n",
       "9        武汉                       某上市药企   \n",
       "10       广州                  广州大馨科技有限公司   \n",
       "11   合肥-高新区                        澳鹏网络   \n",
       "12   南昌-高新区                        澳鹏网络   \n",
       "13       上海            珍岛信息技术（上海）股份有限公司   \n",
       "14   郑州-金水区               河南佳汇供应链管理有限公司   \n",
       "15   深圳-科技园                         盈富斯   \n",
       "16  上海-浦东新区                         河姆渡   \n",
       "17   深圳-福田区                北京兰亭高创科技有限公司   \n",
       "18      五指山                 五指山仁商基业有限公司   \n",
       "19   深圳-福田区                北京兰亭高创科技有限公司   \n",
       "20   深圳-龙华区                   某知名电子商务公司   \n",
       "21    宁波-首南                  宁波渠成集团有限公司   \n",
       "22    上海-航头                         河姆渡   \n",
       "23   武汉-武昌区              武汉科思沃斯教育科技有限公司   \n",
       "24       深圳            深圳市金金金黄金珠宝集团有限公司   \n",
       "25   广州-白云区              广州市艾依格家居制品有限公司   \n",
       "26   上海-嘉定区                上海青谷培训学校有限公司   \n",
       "27   武汉-汉阳区            武汉美福源孕婴童文化发展有限公司   \n",
       "28                          北京某科技股份有限公司   \n",
       "29  上海-浦东新区              开源环保科技(上海)有限公司   \n",
       "30   深圳-南山区                      XX消费金融   \n",
       "31       北京  全球化规模的以IP为核心的儿童全产业链科技文创平台。   \n",
       "32                         中国某技术研究所（北京）   \n",
       "33                                 字节跳动   \n",
       "34   合肥-庐阳区                安徽慕娱文化传媒有限公司   \n",
       "35   长沙-长沙县                        三一集团   \n",
       "36   东莞-长安镇             广东伟一工业互联网科技有限公司   \n",
       "37   福州-长乐区                  德诚珠宝集团有限公司   \n",
       "38   福州-长乐区                  德诚珠宝集团有限公司   \n",
       "39    苏州-常熟                 苏州赛奥科传媒有限公司   \n",
       "\n",
       "                                             链结  \\\n",
       "0   https://www.liepin.com/job/1929909489.shtml   \n",
       "1   https://www.liepin.com/job/1929353477.shtml   \n",
       "2   https://www.liepin.com/job/1930040579.shtml   \n",
       "3   https://www.liepin.com/job/1929920163.shtml   \n",
       "4   https://www.liepin.com/job/1929491963.shtml   \n",
       "5   https://www.liepin.com/job/1927698605.shtml   \n",
       "6   https://www.liepin.com/job/1925360765.shtml   \n",
       "7       https://www.liepin.com/a/20584255.shtml   \n",
       "8       https://www.liepin.com/a/20730347.shtml   \n",
       "9       https://www.liepin.com/a/21134451.shtml   \n",
       "10  https://www.liepin.com/job/1930033697.shtml   \n",
       "11  https://www.liepin.com/job/1930030491.shtml   \n",
       "12  https://www.liepin.com/job/1930030489.shtml   \n",
       "13  https://www.liepin.com/job/1928412511.shtml   \n",
       "14  https://www.liepin.com/job/1928081475.shtml   \n",
       "15  https://www.liepin.com/job/1927933933.shtml   \n",
       "16  https://www.liepin.com/job/1927774977.shtml   \n",
       "17  https://www.liepin.com/job/1922469633.shtml   \n",
       "18  https://www.liepin.com/job/1921122667.shtml   \n",
       "19  https://www.liepin.com/job/1921004331.shtml   \n",
       "20      https://www.liepin.com/a/20520083.shtml   \n",
       "21  https://www.liepin.com/job/1911189738.shtml   \n",
       "22   https://www.liepin.com/job/194614052.shtml   \n",
       "23  https://www.liepin.com/job/1929555931.shtml   \n",
       "24  https://www.liepin.com/job/1928777847.shtml   \n",
       "25  https://www.liepin.com/job/1926237249.shtml   \n",
       "26  https://www.liepin.com/job/1928024771.shtml   \n",
       "27  https://www.liepin.com/job/1922544783.shtml   \n",
       "28      https://www.liepin.com/a/21244297.shtml   \n",
       "29  https://www.liepin.com/job/1921218897.shtml   \n",
       "30      https://www.liepin.com/a/20635273.shtml   \n",
       "31      https://www.liepin.com/a/21249335.shtml   \n",
       "32      https://www.liepin.com/a/21303533.shtml   \n",
       "33  https://www.liepin.com/job/1924504251.shtml   \n",
       "34  https://www.liepin.com/job/1928009023.shtml   \n",
       "35  https://www.liepin.com/job/1930018325.shtml   \n",
       "36  https://www.liepin.com/job/1929954735.shtml   \n",
       "37  https://www.liepin.com/job/1929929539.shtml   \n",
       "38  https://www.liepin.com/job/1929928321.shtml   \n",
       "39  https://www.liepin.com/job/1929925597.shtml   \n",
       "\n",
       "                                       公司URL  \n",
       "0    https://www.liepin.com/company/9322810/  \n",
       "1   https://www.liepin.com/company/12172229/  \n",
       "2    https://www.liepin.com/company/9949751/  \n",
       "3   https://www.liepin.com/company/10100863/  \n",
       "4   https://www.liepin.com/company/12177731/  \n",
       "5   https://www.liepin.com/company/10015511/  \n",
       "6   https://www.liepin.com/company/10100661/  \n",
       "7                                             \n",
       "8                                             \n",
       "9                                             \n",
       "10  https://www.liepin.com/company/12314609/  \n",
       "11   https://www.liepin.com/company/8430632/  \n",
       "12   https://www.liepin.com/company/8430632/  \n",
       "13   https://www.liepin.com/company/1902693/  \n",
       "14  https://www.liepin.com/company/10117257/  \n",
       "15   https://www.liepin.com/company/8523739/  \n",
       "16   https://www.liepin.com/company/8636161/  \n",
       "17  https://www.liepin.com/company/10073645/  \n",
       "18   https://www.liepin.com/company/9469403/  \n",
       "19  https://www.liepin.com/company/10073645/  \n",
       "20                                            \n",
       "21   https://www.liepin.com/company/8096424/  \n",
       "22   https://www.liepin.com/company/8636161/  \n",
       "23  https://www.liepin.com/company/12280475/  \n",
       "24  https://www.liepin.com/company/12179445/  \n",
       "25   https://www.liepin.com/company/7882143/  \n",
       "26  https://www.liepin.com/company/10158297/  \n",
       "27   https://www.liepin.com/company/8748602/  \n",
       "28                                            \n",
       "29   https://www.liepin.com/company/9967783/  \n",
       "30                                            \n",
       "31                                            \n",
       "32                                            \n",
       "33   https://www.liepin.com/company/7863078/  \n",
       "34  https://www.liepin.com/company/12227503/  \n",
       "35    https://www.liepin.com/company/892388/  \n",
       "36  https://www.liepin.com/company/12263359/  \n",
       "37   https://www.liepin.com/company/7896514/  \n",
       "38   https://www.liepin.com/company/7896514/  \n",
       "39   https://www.liepin.com/company/9592537/  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url = \"https://www.liepin.com/zhaopin/?key=网站运营\"\n",
    "session = HTMLSession()\n",
    "r = session.get( url )\n",
    "\n",
    "\n",
    "主要元素 = r.html.xpath( \\\n",
    "    '//ul[@class=\"sojob-list\"]/li')\n",
    "\n",
    "dict_xpaths={ \n",
    "    'text': {\n",
    "        'edu':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]',\n",
    "        '经验':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]/following-sibling::span',\n",
    "        '薪水':    '//div[contains(@class,\"job-info\")]/p/span[@class=\"text-warning\"]', \n",
    "        '时间':    '//div[contains(@class,\"job-info\")]/p/time/@title', \n",
    "        '职称':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "        '公司地点': '//div[contains(@class,\"job-info\")]/p/a',\n",
    "        '公司名称': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "    },\n",
    "    'text_content': {\n",
    "    },\n",
    "    'href': {\n",
    "        '链结':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "        '公司URL': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "    }\n",
    "}\n",
    "\n",
    "def get_e_text_content(_xpath_):\n",
    "    # 高级列表推导\n",
    "    暂存结果 = [e.xpath(_xpath_)[0].lxml.text_content() for e in 主要元素]\n",
    "    return(暂存结果)\n",
    "\n",
    "def get_e_text(_xpath_):\n",
    "    # 高级列表推导\n",
    "    暂存结果 = [\"\".join([x.strip() if type(x) is str else x.text.strip() for x in e.xpath(_xpath_)]) for e in 主要元素]\n",
    "    return(暂存结果)\n",
    "\n",
    "def get_e_href(_xpath_):\n",
    "    \"\"\"Return, for each item in 主要元素, one absolute link of its first _xpath_ match.\n",
    "\n",
    "    Items with no match, or whose match carries no link, contribute an empty string.\n",
    "    \"\"\"\n",
    "    暂存结果 = []\n",
    "    for e in 主要元素:\n",
    "        # Query once per item; with first=True the result is None when nothing matches.\n",
    "        matched = e.xpath(_xpath_, first=True)\n",
    "        links = matched.absolute_links if matched is not None else set()\n",
    "        暂存结果.append(list(links)[0] if len(links) >= 1 else \"\")\n",
    "    return 暂存结果\n",
    "\n",
    "# Evaluate every xpath inside the 主要元素 items only; each key becomes one column.\n",
    "数据字典 = {k: get_e_text_content(v) for k, v in dict_xpaths['text_content'].items()}\n",
    "数据字典.update({k: get_e_text(v) for k, v in dict_xpaths['text'].items()})\n",
    "数据字典.update({k: get_e_href(v) for k, v in dict_xpaths['href'].items()})\n",
    "\n",
    "数据 = pd.DataFrame(数据字典)\n",
    "#数据.to_excel(\"Web数据挖掘_网站运营_liepin.xlsx\", sheet_name=\"搜查结果\")\n",
    "数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2.解析翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Request the first result page again; the pagination links are parsed from `r`.\n",
    "url = \"https://www.liepin.com/zhaopin/?key=网站运营\"\n",
    "session = HTMLSession()\n",
    "r = session.get( url )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=2'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=3'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=4'>, <Element 'a' href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1'>, <Element 
'a' class=('last',) href='/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=9' title='末页'>]\n",
      "{'2': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1', '3': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=2', '4': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=3', '5': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=4', '下一页': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1', '': 
'/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=9'}\n"
     ]
    }
   ],
   "source": [
    "# A bare '//div[@class=\"pagerbar\"]/a' also matches the disabled/current anchors,\n",
    "# whose href is javascript; keep only the anchors that link to /zhaopin.\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "# Run the pager query once and reuse the matched elements.\n",
    "翻页元素 = r.html.xpath(xpath_翻页a)\n",
    "print (翻页元素) # element objects\n",
    "\n",
    "href_列表 = [a.xpath('//@href')[0] for a in 翻页元素]\n",
    "文字_列表 = [a.text for a in 翻页元素]\n",
    "\n",
    "# Anchor text -> href; anchors sharing the same text overwrite each other.\n",
    "href_字典 = dict(zip(文字_列表, href_列表))\n",
    "print (href_字典)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3.构建参数模板，为后面的抓取做准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>scheme</th>\n",
       "      <th>netloc</th>\n",
       "      <th>path</th>\n",
       "      <th>params</th>\n",
       "      <th>query</th>\n",
       "      <th>fragment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>/zhaopin/</td>\n",
       "      <td></td>\n",
       "      <td>compkind=&amp;dqs=&amp;pubTime=&amp;pageSize=40&amp;salary=&amp;co...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  scheme netloc       path params  \\\n",
       "0                /zhaopin/          \n",
       "1                /zhaopin/          \n",
       "2                /zhaopin/          \n",
       "3                /zhaopin/          \n",
       "4                /zhaopin/          \n",
       "5                /zhaopin/          \n",
       "\n",
       "                                               query fragment  \n",
       "0  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "1  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "2  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "3  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "4  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           \n",
       "5  compkind=&dqs=&pubTime=&pageSize=40&salary=&co...           "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scheme      1\n",
      "netloc      1\n",
      "path        1\n",
      "params      1\n",
      "query       5\n",
      "fragment    1\n",
      "dtype: int64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pageSize</th>\n",
       "      <th>sortFlag</th>\n",
       "      <th>key</th>\n",
       "      <th>siTag</th>\n",
       "      <th>d_sfrom</th>\n",
       "      <th>d_ckId</th>\n",
       "      <th>d_curPage</th>\n",
       "      <th>d_pageSize</th>\n",
       "      <th>d_headId</th>\n",
       "      <th>curPage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>40</td>\n",
       "      <td>°radeFlag=0</td>\n",
       "      <td>网站运营</td>\n",
       "      <td>sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>d0c770bbc6e7eb1a29014d7370b5332d</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pageSize     sortFlag   key                                          siTag  \\\n",
       "0       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "1       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "2       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "3       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "4       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "5       40  °radeFlag=0  网站运营  sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw   \n",
       "\n",
       "          d_sfrom                            d_ckId d_curPage d_pageSize  \\\n",
       "0  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "1  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "2  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "3  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "4  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "5  search_unknown  d0c770bbc6e7eb1a29014d7370b5332d         0         40   \n",
       "\n",
       "                           d_headId curPage  \n",
       "0  d0c770bbc6e7eb1a29014d7370b5332d       1  \n",
       "1  d0c770bbc6e7eb1a29014d7370b5332d       2  \n",
       "2  d0c770bbc6e7eb1a29014d7370b5332d       3  \n",
       "3  d0c770bbc6e7eb1a29014d7370b5332d       4  \n",
       "4  d0c770bbc6e7eb1a29014d7370b5332d       1  \n",
       "5  d0c770bbc6e7eb1a29014d7370b5332d       9  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pageSize      1\n",
      "sortFlag      1\n",
      "key           1\n",
      "siTag         1\n",
      "d_sfrom       1\n",
      "d_ckId        1\n",
      "d_curPage     1\n",
      "d_pageSize    1\n",
      "d_headId      1\n",
      "curPage       5\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from urllib.parse import urlparse, parse_qs\n",
    "import pandas as pd\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "# Goal: turn href_列表 into a table of query parameters.\n",
    "\n",
    "# Split every link into its six URL components, one row per link.\n",
    "df = pd.DataFrame([urlparse(href) for href in href_列表])\n",
    "\n",
    "# Parse each query string; parse_qs gives lists, keep the first value of each parameter.\n",
    "# NOTE(review): the recorded output shows sortFlag='°radeFlag=0', which looks like\n",
    "# '&degradeFlag' mis-decoded as the HTML entity '&deg' -- confirm against the raw page.\n",
    "query_records = [\n",
    "    {name: values[0] for name, values in parse_qs(query).items()}\n",
    "    for query in df['query']\n",
    "]\n",
    "df_qs = pd.DataFrame(query_records)\n",
    "\n",
    "display(df)\n",
    "print(df.nunique())\n",
    "display(df_qs)\n",
    "print(df_qs.nunique())\n",
    "\n",
    "# curPage is parsed as text; add an integer copy for min/max.\n",
    "df_qs = df_qs.assign(curPage_int=df_qs.curPage.astype(int))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.构建curpage参数模板"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'pageSize': ['40'], 'sortFlag': ['°radeFlag=0'], 'key': ['网站运营'], 'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'], 'd_sfrom': ['search_unknown'], 'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'], 'd_curPage': ['0'], 'd_pageSize': ['40'], 'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'], 'curPage': ['1']}\n",
      "{'2': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1', '3': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=2', '4': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=3', '5': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=4', '下一页': '/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=1', '': 
'/zhaopin/?compkind=&dqs=&pubTime=&pageSize=40&salary=&compTag=&sortFlag=°radeFlag=0&compIds=&subIndustry=&jobKind=&industries=&compscale=&key=%E7%BD%91%E7%AB%99%E8%BF%90%E8%90%A5&siTag=sTT8QS_ztTHnR804xqdRDQ%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=d0c770bbc6e7eb1a29014d7370b5332d&d_curPage=0&d_pageSize=40&d_headId=d0c770bbc6e7eb1a29014d7370b5332d&curPage=9'}\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def parse_url_qs_for_curPage (url):\n",
    "    \"\"\"Return the query string of `url` as a dict: parameter name -> list of values.\n",
    "\n",
    "    Parsing is done by parse_qs with its defaults, so parameters with an empty\n",
    "    value do not appear in the result.\n",
    "    \"\"\"\n",
    "    return parse_qs(urlparse(url).query)\n",
    "\n",
    "# Use the first pager link as the parameter template.\n",
    "参数模板 = parse_url_qs_for_curPage(href_列表[0])\n",
    "print (参数模板)\n",
    "\n",
    "print (href_字典)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "9\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{0: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [0],\n",
       "  'keyword': ['网站运营']},\n",
       " 1: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [1],\n",
       "  'keyword': ['网站运营']},\n",
       " 2: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [2],\n",
       "  'keyword': ['网站运营']},\n",
       " 3: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [3],\n",
       "  'keyword': ['网站运营']},\n",
       " 4: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [4],\n",
       "  'keyword': ['网站运营']},\n",
       " 5: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [5],\n",
       "  'keyword': ['网站运营']},\n",
       " 6: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [6],\n",
       "  'keyword': ['网站运营']},\n",
       " 7: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [7],\n",
       "  'keyword': ['网站运营']},\n",
       " 8: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [8],\n",
       "  'keyword': ['网站运营']},\n",
       " 9: {'pageSize': ['40'],\n",
       "  'sortFlag': ['°radeFlag=0'],\n",
       "  'key': ['网站运营'],\n",
       "  'siTag': ['sTT8QS_ztTHnR804xqdRDQ~fA9rXquZc5IkJpXC-Ycixw'],\n",
       "  'd_sfrom': ['search_unknown'],\n",
       "  'd_ckId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'd_curPage': ['0'],\n",
       "  'd_pageSize': ['40'],\n",
       "  'd_headId': ['d0c770bbc6e7eb1a29014d7370b5332d'],\n",
       "  'curPage': [9],\n",
       "  'keyword': ['网站运营']}}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def 参数模板生成(keyword, curPage):\n",
    "    \"\"\"Return a copy of 参数模板 with the search keyword and page number filled in.\n",
    "\n",
    "    keyword: search term wrapped in a list, like the parse_qs values (e.g. ['网站运营']).\n",
    "    curPage: page number wrapped in a list (e.g. [0]).\n",
    "    \"\"\"\n",
    "    # A shallow copy is enough: keys are rebound below, the value lists are never mutated.\n",
    "    参数 = 参数模板.copy()\n",
    "    参数['curPage'] = curPage\n",
    "    参数['keyword'] = keyword\n",
    "    # The template is parsed from a pager link, so it still carries the 'key' of the\n",
    "    # search it came from; the search URL is addressed with ?key=..., so switch it too.\n",
    "    参数['key'] = keyword\n",
    "    return 参数\n",
    "\n",
    "# Smallest and largest page number found in the pager links.\n",
    "print (df_qs.curPage_int.min())\n",
    "print (df_qs.curPage_int.max())\n",
    "\n",
    "# One parameter set per page, from page 0 up to the largest page number seen.\n",
    "参数_keyword_网站运营_curPage = {\n",
    "    i: 参数模板生成(curPage=[i], keyword=['网站运营'])\n",
    "    for i in range(0, df_qs.curPage_int.max() + 1)\n",
    "}\n",
    "参数_keyword_网站运营_curPage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# requests_liepin\n",
    "session = HTMLSession()\n",
    "\n",
    "def requests_liepin( url, params):\n",
    "    \"\"\"Fetch one search-result page and return its job listings as a DataFrame.\n",
    "\n",
    "    url:    address of the search page.\n",
    "    params: query parameters sent with the GET request.\n",
    "\n",
    "    Returns a pandas DataFrame with one row per <li> of the result list and one\n",
    "    column per key of dict_xpaths; a field missing from an item is an empty string.\n",
    "    \"\"\"\n",
    "    # Send the caller's parameters rather than a module-level variable.\n",
    "    r = session.get(url, params=params)\n",
    "\n",
    "    # Select the result items first; every xpath below is evaluated inside one item.\n",
    "    主要元素 = r.html.xpath('//ul[@class=\"sojob-list\"]/li')\n",
    "\n",
    "    # Output column name -> xpath, grouped by how the value is extracted.\n",
    "    dict_xpaths={ \n",
    "        'text': {\n",
    "            'edu':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]',\n",
    "            '经验':      '//div[contains(@class,\"job-info\")]/p/span[@class=\"edu\"]/following-sibling::span',\n",
    "            '薪水':    '//div[contains(@class,\"job-info\")]/p/span[@class=\"text-warning\"]', \n",
    "            '时间':    '//div[contains(@class,\"job-info\")]/p/time/@title', \n",
    "            '职称':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "            '公司地点': '//div[contains(@class,\"job-info\")]/p/a',\n",
    "            '公司名称': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "        },\n",
    "        'text_content': {\n",
    "        },\n",
    "        'href': {\n",
    "            '链结':    '//div[contains(@class,\"job-info\")]/h3/a', \n",
    "            '公司URL': '//div[contains(@class,\"sojob-item-main\")]//p[@class=\"company-name\"]/a', \n",
    "        }\n",
    "    }\n",
    "\n",
    "    def get_e_text_content(_xpath_):\n",
    "        # text_content() of the first match per item, empty string when there is none.\n",
    "        暂存结果 = []\n",
    "        for e in 主要元素:\n",
    "            matches = e.xpath(_xpath_)\n",
    "            暂存结果.append(matches[0].lxml.text_content() if matches else \"\")\n",
    "        return 暂存结果\n",
    "\n",
    "    def get_e_text(_xpath_):\n",
    "        # A match is either a str (attribute xpaths such as /@title) or an element\n",
    "        # with .text; strip each one and join them per item.\n",
    "        return [\n",
    "            \"\".join(x.strip() if isinstance(x, str) else x.text.strip()\n",
    "                    for x in e.xpath(_xpath_))\n",
    "            for e in 主要元素\n",
    "        ]\n",
    "\n",
    "    def get_e_href(_xpath_):\n",
    "        # One absolute link per item, empty string when the item has no match or no link.\n",
    "        暂存结果 = []\n",
    "        for e in 主要元素:\n",
    "            matched = e.xpath(_xpath_, first=True)  # None when nothing matches\n",
    "            links = matched.absolute_links if matched is not None else set()\n",
    "            暂存结果.append(list(links)[0] if links else \"\")\n",
    "        return 暂存结果\n",
    "\n",
    "    # Build the columns in a fixed order: text_content, then text, then href.\n",
    "    数据字典 = {k: get_e_text_content(v) for k, v in dict_xpaths['text_content'].items()}\n",
    "    数据字典.update({k: get_e_text(v) for k, v in dict_xpaths['text'].items()})\n",
    "    数据字典.update({k: get_e_href(v) for k, v in dict_xpaths['href'].items()})\n",
    "\n",
    "    return pd.DataFrame(数据字典)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.放慢爬取速度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 50.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "# Crawl every page of the single-keyword search and collect one DataFrame per page.\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "\n",
    "list_df = list()\n",
    "for k,v in 参数_keyword_网站运营_curPage.items():\n",
    "    # Keep the module-level name `payload` bound to the current page's parameters.\n",
    "    payload = v\n",
    "    df = requests_liepin( url, params = payload)\n",
    "    time.sleep(3+4*random())  # slow down: pause 3-7 s, about 5 s on average\n",
    "    df = df.assign (curPage = k)  # tag the rows with their page number\n",
    "    list_df.append(df)\n",
    "\n",
    "# reset_index() keeps each row's position within its page as an 'index' column.\n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "# Export (last week's C-4)\n",
    "df_all.to_excel(\"Web数据挖掘_网站运营_翻页.xlsx\",\\\n",
    "                sheet_name=\"网站运营\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "网站运营 10\n",
      "新媒体 10\n",
      "Wall time: 1min 57s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# B-3 多个页面+多个关键词\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "keywords = ['网站运营','新媒体']\n",
    "list_df = list()\n",
    "\n",
    "for key in keywords:\n",
    "    # Probe the first page of this keyword to find out how many pages it has.\n",
    "    # The response is fetched right here because requests_liepin() returns only the\n",
    "    # parsed table; a module-level `r` left over from an earlier cell would report\n",
    "    # the page count of that earlier search for every keyword.\n",
    "    payload = 参数模板生成(keyword=[key], curPage=['0'])\n",
    "    r = session.get(url, params=payload)\n",
    "    time.sleep(3+4*random())  # slow down: pause 3-7 s, about 5 s on average\n",
    "    href_列表 = [x.xpath('//@href')[0] for x in r.html.xpath(xpath_翻页a)]\n",
    "    if href_列表:\n",
    "        df = pd.DataFrame([ urlparse(x) for x in href_列表])\n",
    "        df_qs = pd.DataFrame([{k:v[0] for k,v in parse_qs(x).items()} for x in df['query'] ])\n",
    "        df_qs = df_qs.assign (curPage_int=df_qs.curPage.astype(int)) # convert to integers\n",
    "        长度 = df_qs.curPage_int.max()+1\n",
    "    else:\n",
    "        # No pager links on the page: treat the result as a single page.\n",
    "        长度 = 1\n",
    "    # One parameter set per page of this keyword.\n",
    "    参数_keyword_X_curPage = {\n",
    "        i: 参数模板生成(curPage=[i], keyword=[key])\n",
    "        for i in range(0, 长度)\n",
    "    }\n",
    "    print (key,长度)\n",
    "\n",
    "    for k,v in 参数_keyword_X_curPage.items():\n",
    "        # Keep the module-level name `payload` bound to the current page's parameters.\n",
    "        payload = v\n",
    "        df = requests_liepin( url, params = payload)\n",
    "        time.sleep(3+4*random())  # slow down: pause 3-7 s, about 5 s on average\n",
    "        df = df.assign (keyword = key)  # tag the rows with their keyword\n",
    "        df = df.assign (curPage = k)  # tag the rows with their page number\n",
    "        list_df.append(df)\n",
    "\n",
    "# reset_index() keeps each row's position within its page as an 'index' column.\n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "df_all.to_excel(\"双关键词_Web数据挖掘_网站运营_liepin_翻页.xlsx\",\\\n",
    "                sheet_name=\"_\".join(keywords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "网站运营 10\n",
      "广州 10\n",
      "新媒体 10\n",
      "Wall time: 2min 52s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# B-3: crawl every result page for several search terms (job titles plus the city\n",
    "# name 广州) and export one combined sheet.\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "url = \"https://www.liepin.com/zhaopin/\"\n",
    "xpath_翻页a = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "\n",
    "keywords = ['网站运营','广州','新媒体']\n",
    "list_df = list()\n",
    "\n",
    "for key in keywords:\n",
    "    # Page 0 is requested once: it is both the first page of results and the probe\n",
    "    # used to find out how many pages exist. (It used to be downloaded twice, back\n",
    "    # to back without a pause, and the first copy was thrown away.)\n",
    "    payload = 参数模板生成(curPage=[0], keyword=[key])\n",
    "    df = requests_liepin(url, params=payload)\n",
    "    time.sleep(3+4*random())  # slow down: 3-7 s, about 5 s on average\n",
    "\n",
    "    # NOTE(review): `r` is not assigned anywhere in this cell. The page count below is\n",
    "    # only right if `r` is the response of the request just made (e.g. set as a global\n",
    "    # inside requests_liepin). If it is left over from an earlier cell, it is stale - confirm.\n",
    "    href_列表 = [x.xpath('//@href')[0] for x in r.html.xpath(xpath_翻页a)]\n",
    "    qs_列表 = [parse_qs(urlparse(href).query) for href in href_列表]\n",
    "    # Pager links without a curPage parameter are skipped instead of breaking the int cast.\n",
    "    curPage_列表 = [int(qs['curPage'][0]) for qs in qs_列表 if 'curPage' in qs]\n",
    "    # curPage is zero-based, so page count = largest value + 1. With no pager links at all\n",
    "    # (a single page of results) this falls back to 1 instead of raising.\n",
    "    长度 = max(curPage_列表, default=0) + 1\n",
    "    print(key, 长度)\n",
    "\n",
    "    # Tag the rows with the search term and the page they came from.\n",
    "    df = df.assign(keyword=key, curPage=0)\n",
    "    list_df.append(df)\n",
    "\n",
    "    # Remaining pages; page 0 is already stored above.\n",
    "    for k in range(1, 长度):\n",
    "        payload = 参数模板生成(curPage=[k], keyword=[key])\n",
    "        df = requests_liepin(url, params=payload)\n",
    "        time.sleep(3+4*random())  # slow down: 3-7 s, about 5 s on average\n",
    "        df = df.assign(keyword=key, curPage=k)\n",
    "        list_df.append(df)\n",
    "\n",
    "# reset_index() moves the concatenated frames' old index into a column and renumbers the rows.\n",
    "df_all = pd.concat(list_df).reset_index()\n",
    "df_all.index.name = '序'\n",
    "\n",
    "df_all.to_excel(\"职位和城市_Web数据挖掘_网站运营_liepin_翻页.xlsx\",\n",
    "                sheet_name=\"_\".join(keywords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
